import pandas as pd
import numpy as np

# Load the master dataset (semicolon-separated CSV).
df = pd.read_csv('master_dataset.csv', delimiter=";", low_memory=False)
df.head()

# Collapse missing values and the placeholder strings to 0.
df = df.fillna(0)
df = df.replace(['other', 'unknown', 'undefined'], 0)
df

# Manual ordinal encoding of the string-valued categorical columns.
df['region'] = df['region'].replace({"AF": 0, "AS": 1, "EU": 2, "NA": 3, "OC": 4, "SA": 5})
df['tipo_sangre'] = df['tipo_sangre'].replace({"abn": 0, "abp": 1, "an": 2, "ap": 3, "bn": 4, "bp": 5, "on": 6, "op": 7})
df['ingresos'] = df['ingresos'].replace({"blank": 0, "gov": 1, "high": 2, "low": 3, "med": 4})
df['carrera'] = df['carrera'].replace({"asian": 0, "black": 1, "blank": 2, "hispanic": 3, "mixed": 4, "other": 5, "white": 6})
df['inmigrante'] = df['inmigrante'].replace({"blank": 0, "immigrant": 1, "native": 2})
df['fuma'] = df['fuma'].replace({"never": 0, "quit0": 1, "quit10": 2, "quit5": 3, "vape": 4, "yesheavy": 5, "yeslight": 6, "yesmedium": 7})
df['trabajando'] = df['trabajando'].replace({"home": 0, "never": 1, "stopped": 2, "travel critical": 3, "travel non critical": 4})
df['tasa_reduccion_riesgo_tipo_mascarilla'] = df['tasa_reduccion_riesgo_tipo_mascarilla'].replace(
    {"clothhome": 0, "clothstore": 1, "level1": 2, "level2": 3, "level3": 4, "na": 5, "surgical": 6})

# Force the nominal categorical columns to string dtype (one-hot encoded later).
for _col in ('pais', 'sexo', 'seguro', 'prescripcion_medica'):
    df[_col] = df[_col].astype('str')
# --- Feature selection and preprocessing pipeline ----------------------------
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer

# Numeric / ordinal-encoded columns, declared ONCE so the subset and the
# ColumnTransformer below cannot drift apart (the original repeated the same
# 40-name list verbatim in both places).
columnas_numericas = ['region', 'latitud', 'longitud', 'exactitud', 'edad', 'altura', 'peso', 'masa_corporal', 'tipo_sangre',
    'ingresos', 'carrera', 'inmigrante', 'fuma', 'alcohol', 'canabis', 'anfetaminas', 'cocaina', 'lsd', 'extasis',
    'cuenta_contactos', 'cuenta_casa', 'cuenta_transporte_publico', 'trabajando', 'preocupado', 'tasa_reduccion_riesgo_individual',
    'tasa_reduccion_riesgo_unico_distanciamiento_social', 'tasa_reduccion_riesgo_lavarse_manos',
    'tasa_reduccion_riesgo_casa', 'tasa_reduccion_riesgo_distanciamiento_social_casa',
    'tasa_reduccion_riesgo_lavado_manos_casa', 'tasa_reduccion_riesgo_desinfectante',
    'tasa_reduccion_mascarilla', 'tasa_reduccion_riesgo_tipo_mascarilla', 'tasa_accion_gobierno', 'tasa_control_gobierno',
    'tasa_gasto_gobierno', 'opinion_infeccion', 'opinion_mortalidad', 'riesgo_infeccion',
    'riesgo_mortalidad']
# Nominal columns, one-hot encoded below.
columnas_categoricas = ['pais', 'sexo', 'seguro', 'prescripcion_medica']

# Columns that go through the preprocessing pipeline (same order as before:
# numeric first, then the four nominal columns).
Xsubset = df[columnas_numericas + columnas_categoricas]

# Binary/ordinal medical indicator columns, kept unscaled and re-attached
# to the preprocessed frame later.
Var_Ordinales = df[['covid19_sintomas', 'covid19_contacto', 'asma', 'nefropatia',
    'enfermedad_higado', 'inmune_coprometida', 'cardiopatia', 'enfermedad_pulmonar', 'diabetes', 'VIH_positva', 'hipertension',
    'otro_cronico', 'asilo_ancianos', 'trabajador_salud', 'covid19_positivo']]

# Standardize the numeric columns, one-hot encode the nominal ones.
preprocesador1 = make_column_transformer(
    (StandardScaler(), columnas_numericas),
    (OneHotEncoder(), columnas_categoricas)
)
# Store the transformed values in X.
X = preprocesador1.fit_transform(Xsubset)
# Nominal categorical variables (one-hot encoded by the transformer above).
categorical_features = ['pais', 'sexo', 'seguro', 'prescripcion_medica']
# Names of the standardized numeric columns, in transformer output order.
cnamesDataset1 = ['region', 'latitud', 'longitud', 'exactitud', 'edad', 'altura', 'peso', 'masa_corporal', 'tipo_sangre',
    'ingresos', 'carrera', 'inmigrante', 'fuma', 'alcohol', 'canabis', 'anfetaminas', 'cocaina', 'lsd', 'extasis',
    'cuenta_contactos', 'cuenta_casa', 'cuenta_transporte_publico', 'trabajando', 'preocupado', 'tasa_reduccion_riesgo_individual',
    'tasa_reduccion_riesgo_unico_distanciamiento_social', 'tasa_reduccion_riesgo_lavarse_manos',
    'tasa_reduccion_riesgo_casa', 'tasa_reduccion_riesgo_distanciamiento_social_casa',
    'tasa_reduccion_riesgo_lavado_manos_casa', 'tasa_reduccion_riesgo_desinfectante',
    'tasa_reduccion_mascarilla', 'tasa_reduccion_riesgo_tipo_mascarilla', 'tasa_accion_gobierno', 'tasa_control_gobierno',
    'tasa_gasto_gobierno', 'opinion_infeccion', 'opinion_mortalidad', 'riesgo_infeccion',
    'riesgo_mortalidad']
# Column names generated by the OneHotEncoder (second transformer).
_ohe = preprocesador1.transformers_[1][1]
try:
    # scikit-learn >= 1.0
    cnamesDataset2 = list(_ohe.get_feature_names_out(categorical_features))
except AttributeError:
    # BUG FIX for modern sklearn: get_feature_names() was deprecated in 1.0
    # and removed in 1.2; keep it only as a fallback for old installs.
    cnamesDataset2 = list(_ohe.get_feature_names(categorical_features))
# Append the one-hot column names to the numeric ones.
cnamesDataset1.extend(cnamesDataset2)
# Build the preprocessed dataset and re-attach the ordinal medical columns.
preprocesadoStandar = pd.DataFrame(data=X, columns=cnamesDataset1)
datasetPreprocesado = pd.concat([preprocesadoStandar, Var_Ordinales], axis=1)
datasetPreprocesado.to_csv("DatasetPreprocesado.csv", sep=";", index=False)  # sep is the separator, default is ","
datasetPreprocesado.head()
# Inspect the distinct 'edad' (age bucket) values in both datasets.
gedf=df.groupby(['edad'])
print('--------- Numero de Grupos con la cantidad de Ususarios en cada Grupo del Dataset Principal ---------')
print('Cantidad de Grupos -->',len(gedf.groups))
print(gedf.size())
gedp=datasetPreprocesado.groupby(['edad'])
print('--------- Numero de Grupos con la cantidad de Ususarios en cada Grupo del Dataset Preprocesado ---------')
print('Cantidad de Grupos -->',len(gedp.groups))
print(gedp.size())
# Split the preprocessed dataset into age-range sub-datasets.
# NOTE(review): these magic floats are the exact STANDARDIZED 'edad' values
# produced by StandardScaler for each original age bucket (presumably taken
# from the group listing printed above). This is brittle: any change in the
# input data re-scales 'edad' and every comparison below silently matches
# nothing — TODO: confirm and consider selecting on the raw ages instead.
datasetNinez = datasetPreprocesado[(datasetPreprocesado['edad'] == -1.2057169865380815) | (datasetPreprocesado['edad'] == -1.2023210292675681)]
print('Dataset Niñez con ',len(datasetNinez), 'pacientes')
datasetAdolecencia =datasetPreprocesado[(datasetPreprocesado['edad'] == -0.8593293449457281)]
print('Dataset Adolecencia con ',len(datasetAdolecencia), 'pacientes')
datasetJuventud =datasetPreprocesado[(datasetPreprocesado['edad'] == -0.516337660623888)]
print('Dataset Juventud con ',len(datasetJuventud), 'pacientes')
datasetAdultez =datasetPreprocesado[(datasetPreprocesado['edad'] == -0.1733459763020478) | (datasetPreprocesado['edad'] == 0.16964570801979234) | (datasetPreprocesado['edad'] == 0.5126373923416325)]
print('Dataset Adultez con ',len(datasetAdultez), 'pacientes')
# The two extreme values (29.39…, 32.79…) look like standardized outlier ages
# grouped into the elderly dataset — TODO confirm.
datasetPMayor =datasetPreprocesado[(datasetPreprocesado['edad'] == 0.8556290766634727) | (datasetPreprocesado['edad'] == 1.1986207609853128) | (datasetPreprocesado['edad'] == 1.5416124453071531) | (datasetPreprocesado['edad'] == 29.391858020786472) | (datasetPreprocesado['edad'] == 32.79121124857025)]
print('Dataset Persona Mayor con ',len(datasetPMayor), 'pacientes')
datasetNinez.describe()
# --- Correlation bar plots: 5 age groups x 3 output variables ----------------
import matplotlib.pyplot as plt
from random import randint
import numpy as np

def _graficar_correlaciones(dataset, objetivo, nombre_archivo):
    """Bar-plot the correlation of every column of `dataset` with the output
    column `objetivo` and save the figure to `nombre_archivo` (JPG).

    Refactor: replaces 15 copy-pasted plotting cells (5 age groups x 3
    targets) that differed only in dataset, target column and file name.
    """
    corr = dataset.corrwith(dataset[objetivo]).round(10)
    columnas = dataset.columns
    etiquetas = range(1, len(columnas) + 1)
    colormap = plt.cm.gist_ncar
    plt.figure(figsize=(15, 8))
    # One distinct color per column, sampled from the colormap.
    colorst = [colormap(i) for i in np.linspace(0, 0.9, len(columnas))]
    ax = corr.plot(kind='bar', color=colorst)
    plt.xticks(rotation=90)
    # Overlay a star marker per bar so each column gets a numbered label.
    for x, y, lab in zip(columnas, corr, etiquetas):
        ax.scatter(x, y, label=lab, marker='*')
    # Recolor the scatter markers to match their bar.
    for t, j1 in enumerate(ax.collections):
        j1.set_color(colorst[t])
    ax.tick_params(axis='x', labelrotation=90)
    plt.title("Correlaciones con la variable de Salida " + objetivo)
    # NOTE: the original cells were inconsistent here ("Valor correlacion…",
    # one "rieso_mortalidad" typo); normalized to a single correct format.
    plt.ylabel("Valor de Correlacion con " + objetivo)
    plt.ylim(-1, 1)
    ax.grid()
    plt.savefig(nombre_archivo, bbox_inches='tight')

# File names are kept byte-identical to the originals (including their
# inconsistent "Correlaciones …" / "Correlaciones en …" prefixes) so any
# downstream consumer of the JPGs keeps working.
_graficar_correlaciones(datasetNinez, 'riesgo_infeccion', "Correlaciones Niñez con Risk_Infection.jpg")
_graficar_correlaciones(datasetNinez, 'riesgo_mortalidad', "Correlaciones Niñez con risk_mortality.jpg")
_graficar_correlaciones(datasetNinez, 'covid19_positivo', "Correlaciones en Niñez con covid19_positive.jpg")

datasetAdolecencia.describe()
_graficar_correlaciones(datasetAdolecencia, 'riesgo_infeccion', "Correlaciones en Adolecencia con Risk_Infection.jpg")
_graficar_correlaciones(datasetAdolecencia, 'riesgo_mortalidad', "Correlaciones en Adolecencia con risk_mortality.jpg")
_graficar_correlaciones(datasetAdolecencia, 'covid19_positivo', "Correlaciones en Adolecencia con covid19_positive.jpg")

datasetJuventud.describe()
_graficar_correlaciones(datasetJuventud, 'riesgo_infeccion', "Correlaciones en Juventud con Risk_Infection.jpg")
_graficar_correlaciones(datasetJuventud, 'riesgo_mortalidad', "Correlaciones en Juventud con risk_mortality.jpg")
_graficar_correlaciones(datasetJuventud, 'covid19_positivo', "Correlaciones en Juventud con covid19_positive.jpg")

datasetAdultez.describe()
_graficar_correlaciones(datasetAdultez, 'riesgo_infeccion', "Correlaciones en Adultez con Risk_Infection.jpg")
_graficar_correlaciones(datasetAdultez, 'riesgo_mortalidad', "Correlaciones en Adultez con risk_mortality.jpg")
_graficar_correlaciones(datasetAdultez, 'covid19_positivo', "Correlaciones en Adultez con covid19_positive.jpg")

datasetPMayor.describe()
_graficar_correlaciones(datasetPMayor, 'riesgo_infeccion', "Correlaciones en Personas Mayores con Risk_Infection.jpg")
_graficar_correlaciones(datasetPMayor, 'riesgo_mortalidad', "Correlaciones en Personas Mayores con risk_mortality.jpg")
_graficar_correlaciones(datasetPMayor, 'covid19_positivo', "Correlaciones en Personas Mayores con covid19_positive.jpg")
# --- RandomForest + PCA evaluation per age group -----------------------------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def _entrenar_evaluar(dataset):
    """Train and evaluate a RandomForest predicting 'covid19_positivo'.

    Pipeline: 80/20 split -> StandardScaler -> full PCA -> RandomForest
    (max_depth=2). Prints the confusion matrix and accuracy.

    Refactor: replaces 5 copy-pasted train/evaluate cells.
    Returns the predictions on the test split.
    """
    # drop(columns=...) — the positional axis argument was removed in pandas 2.0.
    X = dataset.drop(columns='covid19_positivo')
    y = dataset['covid19_positivo']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    # Full PCA (all components): a pure rotation of the scaled features.
    pca = PCA()
    PCAX_train = pca.fit_transform(X_train)
    PCAX_test = pca.transform(X_test)
    explained_variance = pca.explained_variance_ratio_
    classifier = RandomForestClassifier(max_depth=2, random_state=0)
    classifier.fit(PCAX_train, y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(PCAX_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print('accuracy: ' + str(accuracy_score(y_test, y_pred)))
    return y_pred

_entrenar_evaluar(datasetNinez)
_entrenar_evaluar(datasetAdolecencia)
_entrenar_evaluar(datasetJuventud)
_entrenar_evaluar(datasetAdultez)
# BUG FIX: the original fifth cell re-evaluated datasetAdultez (copy-paste
# slip), so datasetPMayor was never evaluated at all.
_entrenar_evaluar(datasetPMayor)
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# --- PCA dimensionality analysis on the Niñez dataset ------------------------
# BUG FIX: the original called datasetNinez.drop('covid19_positivo', 1) and
# DISCARDED the result, so the target column leaked into the PCA input.
# (The intent is clear: the target is re-attached as 'Salida' further below.)
XN = datasetNinez.drop(columns='covid19_positivo')
n_samples = XN.shape[0]
n_features = XN.shape[1]
# First approach: percentage of variance captured by each component.
print('********** Primera Forma')
num_components = min(n_samples, n_features)  # theoretical upper bound on components
# Keep enough components to retain 85% of the data's variance.
pca = PCA(n_components=.85)
X_transformed = pca.fit_transform(XN)
eigenvalues = pca.explained_variance_
num_componentsSeleccionados = X_transformed.shape[1]
print(num_componentsSeleccionados)
print('Varianzas:')
print(eigenvalues.round(3))
explained_variance_ratio_ = pca.explained_variance_ratio_
print('Porcentaje de varianza de cada dimension con respecto a la varianza total:')
print(explained_variance_ratio_.round(3))
print('Porcentaje acumulado de varianza:')
explained_variance_ratio_cumsum = explained_variance_ratio_.cumsum()
print(explained_variance_ratio_cumsum.round(3))
print("El Numero de componentes seleccionados es: ", num_componentsSeleccionados)
# Component indices 1..k: x-axis values here, column labels below.
# (Replaces the redundant `a = range(...); num_pc = a[::1]` idiom.)
num_pc = list(range(1, num_componentsSeleccionados + 1))
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
plt.plot(num_pc, explained_variance_ratio_, color='orange', linestyle='dashed', linewidth=3,
         marker='o', markerfacecolor='blue', markersize=5, label='Valor Varianza')
plt.title('Varianza con ' + str(num_componentsSeleccionados) + ' Componentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper right")
plt.grid()
plt.subplot(2, 2, 2)
plt.plot(num_pc, explained_variance_ratio_cumsum, color='orange', linestyle='dashed', linewidth=3,
         marker='o', markerfacecolor='blue', markersize=5, label='Valor Varianzas Acumuladas')
# Typo fix in the displayed title ("Componenentes" -> "Componentes").
plt.title('Varianza con ' + str(num_componentsSeleccionados) + ' Componentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper left")
plt.grid()
plt.show()
# DataFrame of the selected principal components, columns labelled 1..k.
principalDfN = pd.DataFrame(data=X_transformed, columns=num_pc)
principalDfN.describe()
# Re-attach the target to the PCA-reduced frame as 'Salida'.
principalDfN['Salida'] = datasetNinez['covid19_positivo'].values
# drop(columns=...) — the positional axis argument was removed in pandas 2.0.
XSN = principalDfN.drop(columns='Salida')
ySN = principalDfN['Salida']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XSN, ySN, test_size=0.2, random_state=0)
from sklearn.decomposition import PCA
# NOTE(review): this applies a second PCA to data that is already principal
# components — presumably intentional (a rotation of the reduced space), but
# worth confirming.
pca = PCA()
PCAX_train = pca.fit_transform(X_train)
PCAX_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(PCAX_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(PCAX_test)
y_pred
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy: ' + str(accuracy_score(y_test, y_pred)))
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Sweep the number of PCA components (1..18) and record the RandomForest
# accuracy for each. (Idiom cleanup: removed the un-Pythonic trailing
# semicolons; redundant `key=float` dropped from max().)
PDAaccuracy = []
PDAncomponents = []
for i in range(1, 19):
    pca = PCA(n_components=i)
    PCAX_train = pca.fit_transform(X_train)
    PCAX_test = pca.transform(X_test)
    explained_variance = pca.explained_variance_ratio_
    classifier = RandomForestClassifier(max_depth=2, random_state=0)
    classifier.fit(PCAX_train, y_train)
    y_pred = classifier.predict(PCAX_test)
    PDAncomponents.append(i)
    cm = confusion_matrix(y_test, y_pred)
    PDAaccuracy.append(accuracy_score(y_test, y_pred))
# Best accuracy; .index() returns the FIRST maximum, i.e. the smallest
# component count that achieves it.
AccuracyMaximo = max(PDAaccuracy)
numComponente = PDAaccuracy.index(AccuracyMaximo)
print("Num. Componentes = ", numComponente+1, "AcuraccyMaximo = ", AccuracyMaximo)
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn import metrics
from time import time

# Elbow method: fit K-Means for k = 1..18 and record both the inertia
# (distortion) and the wall-clock time of each fit.
X = principalDfN
K_range = range(1, 19, 1)
distortions = []
timeTotal = []
for k in K_range:
    t_inicio = time()
    modelo = KMeans(n_clusters=k, init='k-means++')
    modelo.fit(principalDfN)
    distortions.append(modelo.inertia_)
    timeTotal.append(time() - t_inicio)

# Distortion vs. number of clusters (the "elbow" plot).
fig1 = plt.figure()
ex = fig1.add_subplot(111)
ex.plot(K_range, distortions, 'b*-', color='orange')
plt.grid(True)
plt.xlabel('Numeros de clusters')
plt.ylabel('Distorcion de la Media')
plt.title('Eleccion del Mejor K con el Metodo de Elbow')

# Fit time vs. number of clusters.
fig2 = plt.figure()
plt.plot(K_range, timeTotal, 'b*-', color="orange")
plt.grid(True)
plt.xlabel('Numero de Clusters')
plt.ylabel('Tiempo de Ejecucion (segundos)')
plt.title('Tiempo de Clusters')
def doKmeans(x, nclust=4, init='k-means++', max_iter=100, tol=0.0001, random_state=10, algorithm='full'):
    """Fit a K-Means model on `x` and return (cluster labels, centroids).

    BUG FIX: the original built `KMeans(nclust)` and silently ignored every
    other parameter (init, max_iter, tol, random_state, algorithm), so the
    random_state default never made the clustering reproducible. All
    parameters are now forwarded to KMeans.
    """
    # 'full' was renamed 'lloyd' in scikit-learn 1.1 and removed in 1.3;
    # translate so old call sites keep working on modern sklearn.
    if algorithm == 'full':
        algorithm = 'lloyd'
    model = KMeans(n_clusters=nclust, init=init, max_iter=max_iter, tol=tol,
                   random_state=random_state, algorithm=algorithm)
    model.fit(x)
    clust_labels = model.predict(x)
    cent = model.cluster_centers_
    return (clust_labels, cent)
# Cluster the PCA frame into 40 groups and plot users-per-cluster.
clust_labels, cent = doKmeans(principalDfN, 40, init='k-means++', max_iter=100, tol=0.0001, random_state=10, algorithm='full')
kmeans = pd.DataFrame(clust_labels, columns=['Grupos'])
kmeans
# Users per cluster, largest clusters first.
UserGrupoK = kmeans.groupby(kmeans.Grupos).Grupos.count()
UserGrupoK = UserGrupoK.sort_values(ascending=False, inplace=False, kind='quicksort')
import matplotlib.pyplot as plt
etiquetas_grupo = UserGrupoK.index.tolist()
conteos = UserGrupoK.values.tolist()
plt.rcParams.update({'font.size': 5})
plt.plot(range(len(etiquetas_grupo)), conteos, color='orange', marker='o')
plt.title("Distribucion de los usuarios")
plt.xlabel("Cluster")
plt.ylabel("Numero Usuarios")
plt.show()
from mpl_toolkits.mplot3d import axes3d
from sklearn import datasets
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt

# Project onto the first 3 principal components for the 3-D visualisation.
# NOTE(review): X is principalDfN INCLUDING the 'Salida' target column
# (attached earlier), so the target participates in this projection and in
# the clustering below — confirm that this is intended.
num_components = 3
pca = PCA(num_components)
principalComponents = pca.fit_transform(X)
num_components = principalComponents.shape[1]
explained_variance_ratio_ = pca.explained_variance_ratio_
explained_variance_ratio_
# Column labels 0, 1, 2 (idiom fix: replaces `a = range(n); num_pc = a[::1]`).
num_pc = list(range(num_components))
principalDfN = pd.DataFrame(data=principalComponents, columns=num_pc)
principalDfN = round(principalDfN, 2)
# Cluster the 3-D projection into 4 groups and tag each row with its group.
clust_labels, cent = doKmeans(principalDfN, 4)
kmeans = pd.DataFrame(clust_labels)
finalDf = pd.concat([principalDfN, kmeans], axis=1)
finalDf.columns = ['ComponentePrincipal1', 'ComponentePrincipal2', 'ComponentePrincipal3', 'target']
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
def init():
    """Draw the initial 3-D PCA scatter frame (FuncAnimation init_func).

    Reads module-level state: ``ax1`` (3-D axes), ``finalDf`` (three principal
    components plus the k-means label column 'target') and ``fig``.
    """
    ax1.set_xlabel('Componente Principal 1', fontsize=10)
    ax1.set_ylabel('Componente Principal 2', fontsize=10)
    ax1.set_zlabel('Componente Principal 3', fontsize=10)
    ax1.set_title('PCA 3D',fontsize=15)
    # One scatter call per cluster so each gets its own colour; the legend
    # below relies on this call order matching targetsNom.
    targets=[0,1,2,3]
    targetsNom=['Grupo 1', 'Grupo 2','Grupo 3', 'Grupo 4']
    colors=['blue','green','red', 'yellow']
    for target, color in zip(targets,colors):
        indicesToKeep = finalDf['target'] == target
        ax1.scatter(finalDf.loc[indicesToKeep, 'ComponentePrincipal1']
                    , finalDf.loc[indicesToKeep, 'ComponentePrincipal2']
                    , finalDf.loc[indicesToKeep, 'ComponentePrincipal3']
                    , c=color
                    )
    ax1.legend(targetsNom)
    ax1.grid()
    return fig,
def animate(i):
    """Rotate the 3-D view by 3.6 degrees per frame (full turn over 100 frames)."""
    ax1.view_init(elev=30., azim=3.6*i)
    return fig,
# Animate the rotating 3-D PCA scatter.
ani = animation.FuncAnimation(fig, animate, init_func=init,
                              frames=100, interval=100, blit=True)
# Re-run k-means (k=4) and show a flat 2-D view of the first two components.
clust_labels, cent = doKmeans(principalDfN, 4)
kmeans = pd.DataFrame(clust_labels)
kmeans
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(principalDfN[0],principalDfN[1],c=kmeans[0],s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('1 FACTOR')
ax.set_ylabel('2 FACTOR')
plt.colorbar(scatter)
# ============ Adolescence subset: PCA variance analysis + Random Forest ============
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# NOTE(review): DataFrame.drop is not in-place and its result is discarded, so
# the target column 'covid19_positivo' remains inside XA below — presumably a
# bug (target leakage into the PCA features); verify intent.
datasetAdolecencia.drop('covid19_positivo',1)
XA=datasetAdolecencia
n_samples = XA.shape[0]
n_features = XA.shape[1]
# First approach: percentage of variance captured by each component.
print('********** Primera Forma')
num_components = min(n_samples, n_features)
#num_components=num_componentsSeleccionados
# Keep 85% of the variance of the data.
pca = PCA(n_components=.85)
X_transformed = pca.fit_transform(XA)  # matrix of principal components
eigenvalues = pca.explained_variance_
num_componentsSeleccionados = X_transformed.shape[1]
print(num_componentsSeleccionados)
print('Varianzas:')
print(eigenvalues.round(3))
explained_variance_ratio_=pca.explained_variance_ratio_
print('Porcentaje de varianza de cada dimension con respecto a la varianza total:')
print(explained_variance_ratio_.round(3))
# Cumulative percentage of explained variance.
print('Porcentaje acumulado de varianza:')
explained_variance_ratio_cumsum=explained_variance_ratio_.cumsum()
print(explained_variance_ratio_cumsum.round(3))
print("El Numero de componentes seleccionados es: ", num_componentsSeleccionados)
num_pc= num_componentsSeleccionados
a = range(1,num_pc+1)
num_pc= a[::1]  # no-op slice; num_pc becomes the range 1..num_componentsSeleccionados
# Per-component and cumulative variance plots.
plt.figure(figsize=(12,10))
plt.subplot(2, 2, 1)
plt.plot(num_pc, explained_variance_ratio_, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianza')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper right")
plt.grid()
plt.subplot(2, 2, 2)
plt.plot(num_pc, explained_variance_ratio_cumsum, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianzas Acumuladas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper left")
plt.grid()
plt.show()
principalDfA = pd.DataFrame(data = X_transformed
                            , columns = num_pc)
principalDfA.describe()
# Attach the classification target, then split features/labels.
principalDfA['Salida']=datasetAdolecencia['covid19_positivo'].values
XSA = principalDfA.drop('Salida',1)  # NOTE(review): positional axis arg is deprecated in pandas
ySA = principalDfA['Salida']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XSA, ySA, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # scaler fitted on train only, then reused on test
X_test = sc.transform(X_test)
from sklearn.decomposition import PCA
pca = PCA()
PCAX_train = pca.fit_transform(X_train)
PCAX_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(PCAX_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(PCAX_test)
y_pred
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy: ' + str(accuracy_score(y_test, y_pred)))
# Sweep n_components = 1..21 and record the RF accuracy for each.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
PDAaccuracy= []
PDAncomponents=[]
for i in range(1,22):
    pca = PCA(n_components=i);
    PCAX_train = pca.fit_transform(X_train);
    PCAX_test = pca.transform(X_test);
    explained_variance = pca.explained_variance_ratio_;
    classifier = RandomForestClassifier(max_depth=2, random_state=0);
    classifier.fit(PCAX_train, y_train);
    y_pred = classifier.predict(PCAX_test);
    PDAncomponents.append(i);
    cm = confusion_matrix(y_test, y_pred);
    PDAaccuracy.append(accuracy_score(y_test, y_pred));
# Best accuracy and the 1-based component count that produced it
# (list index i corresponds to n_components = i + 1).
AccuracyMaximo = max(PDAaccuracy, key=float);
numComponente = PDAaccuracy.index(AccuracyMaximo);
print("Num. Componentes = ", numComponente+1, "AcuraccyMaximo = ", AccuracyMaximo)
# Elbow method: k-means inertia and wall-clock time for k = 1..21.
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn import metrics
from time import time
X=principalDfA
K_range=range(1,22,1)
distortions=[]
timeTotal=[]
for i in K_range:
    timeI=time()
    kmeanModel = KMeans(n_clusters=i,init='k-means++')
    kmeanModel.fit(principalDfA)
    distortions.append(kmeanModel.inertia_)
    timeF=time()
    timeC= timeF - timeI
    timeTotal.append(timeC)
fig1=plt.figure()
ex = fig1.add_subplot(111)
ex.plot(K_range, distortions, 'b*-')
plt.grid(True)
plt.xlabel('Numeros de clusters')
plt.ylabel('Distorcion de la Media')
plt.title('Eleccion del Mejor K con el Metodo de Elbow')
fig2=plt.figure()
plt.plot(K_range, timeTotal, 'b*-', color="red")
plt.grid(True)
plt.xlabel('Numero de Clusters')
plt.ylabel('Tiempo de Ejecucion (segundos)')
plt.title('Tiempo de Clusters')
def doKmeans(x, nclust=8, init='k-means++', max_iter=100, tol=0.0001, random_state=10, algorithm='full'):
    """Run K-Means on *x* and return ``(labels, centroids)``.

    Bug fix: the original built ``KMeans(nclust)`` and silently ignored
    ``init``, ``max_iter``, ``tol``, ``random_state`` and ``algorithm``, so
    runs were non-reproducible and callers' settings had no effect.  All
    arguments are now forwarded to ``sklearn.cluster.KMeans``.
    NOTE(review): ``algorithm='full'`` was renamed ``'lloyd'`` in recent
    scikit-learn releases — confirm the installed version accepts 'full'.
    """
    model = KMeans(n_clusters=nclust, init=init, max_iter=max_iter,
                   tol=tol, random_state=random_state, algorithm=algorithm)
    clust_labels = model.fit_predict(x)  # fit + predict on the same data
    cent = model.cluster_centers_
    return (clust_labels, cent)
# Cluster the adolescence PCA projection.  NOTE(review): k=40 requested here
# although the elbow curve above only examined k in 1..21 — confirm intended.
clust_labels, cent = doKmeans(principalDfA, 40, init='k-means++',max_iter=100, tol=0.0001, random_state=10, algorithm='full')
kmeans = pd.DataFrame(clust_labels,columns=['Grupos'])
kmeans
# Users per cluster, largest first, then plot the distribution.
UserGrupoK=kmeans.groupby(kmeans.Grupos).Grupos.count()
UserGrupoK=UserGrupoK.sort_values(ascending=False, inplace=False, kind='quicksort')
import matplotlib.pyplot as plt
grupo=UserGrupoK.index.tolist()
valores=UserGrupoK.values.tolist()
plt.rcParams.update({'font.size': 5})
plt.plot(range(len(grupo)), valores, color='black', marker='o')
plt.title("Distribucion de los usuarios")
plt.xlabel("Cluster")
plt.ylabel("Numero Usuarios")
plt.show()
# Re-project to 3 principal components for the 3-D view (X = principalDfA here).
from mpl_toolkits.mplot3d import axes3d
from sklearn import datasets
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
num_components=3
pca = PCA(num_components)
principalComponents = pca.fit_transform(X)
num_components=principalComponents.shape[1]
explained_variance_ratio_=pca.explained_variance_ratio_
explained_variance_ratio_
a = range(num_components)
num_pc= a[::1]  # no-op slice
principalDfA = pd.DataFrame(data = principalComponents, columns = num_pc)
principalDfA=round(principalDfA, 2)
# Cluster the 3-component projection (k=8) and attach labels as 'target'.
clust_labels, cent = doKmeans(principalDfA, 8)
kmeans = pd.DataFrame(clust_labels)
finalDfA = pd.concat([principalDfA, kmeans], axis = 1)
finalDfA.columns = ['ComponentePrincipal1', 'ComponentePrincipal2', 'ComponentePrincipal3', 'target']
# Notebook cell: interactive backend + 3-D animation scaffolding.
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
def init():
    """Draw the initial 3-D PCA scatter frame for the adolescence clusters.

    FuncAnimation init_func; reads module globals ``ax1``, ``finalDfA``
    (3 principal components + k-means 'target' label) and ``fig``.
    """
    ax1.set_xlabel('Componente Principal 1', fontsize=10)
    ax1.set_ylabel('Componente Principal 2', fontsize=10)
    ax1.set_zlabel('Componente Principal 3', fontsize=10)
    ax1.set_title('PCA 3D',fontsize=15)
    # One scatter call per cluster; legend order follows call order.
    targets=[0,1,2,3,4,5,6,7]
    targetsNom=['Grupo 1', 'Grupo 2','Grupo 3', 'Grupo 4','Grupo 5', 'Grupo 6','Grupo 7', 'Grupo 8']
    colors=['blue','green','red', 'yellow','cyan','pink','brown','black']
    for target, color in zip(targets,colors):
        indicesToKeep = finalDfA['target'] == target
        ax1.scatter(finalDfA.loc[indicesToKeep, 'ComponentePrincipal1']
                    , finalDfA.loc[indicesToKeep, 'ComponentePrincipal2']
                    , finalDfA.loc[indicesToKeep, 'ComponentePrincipal3']
                    , c=color
                    )
    ax1.legend(targetsNom)
    ax1.grid()
    return fig,
def animate(i):
    """Rotate the 3-D view by 3.6 degrees per frame (full turn over 100 frames)."""
    ax1.view_init(elev=30., azim=3.6*i)
    return fig,
# Animate the rotating 3-D PCA scatter (adolescence clusters).
ani = animation.FuncAnimation(fig, animate, init_func=init,
                              frames=100, interval=100, blit=True)
# Re-run k-means (k=8) and show a flat 2-D view of the first two components.
clust_labels, cent = doKmeans(principalDfA, 8)
kmeans = pd.DataFrame(clust_labels)
kmeans
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(principalDfA[0],principalDfA[1],c=kmeans[0],s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('1 FACTOR')
ax.set_ylabel('2 FACTOR')
plt.colorbar(scatter)
# ============ Youth subset: PCA variance analysis + Random Forest ============
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# NOTE(review): .drop() result is discarded (not in-place), so the target
# column 'covid19_positivo' remains inside XJ below — likely target leakage.
datasetJuventud.drop('covid19_positivo',1)
XJ=datasetJuventud
n_samples = XJ.shape[0]
n_features = XJ.shape[1]
# First approach: percentage of variance captured by each component.
print('********** Primera Forma')
num_components = min(n_samples, n_features)
#num_components=num_componentsSeleccionados
# Keep 85% of the data variance (original comment said 90%, but n_components=.85).
pca = PCA(n_components=.85)
X_transformed = pca.fit_transform(XJ)  # matrix of principal components
eigenvalues = pca.explained_variance_
num_componentsSeleccionados = X_transformed.shape[1]
print(num_componentsSeleccionados)
print('Varianzas:')
print(eigenvalues.round(3))
explained_variance_ratio_=pca.explained_variance_ratio_
print('Porcentaje de varianza de cada dimension con respecto a la varianza total:')
print(explained_variance_ratio_.round(3))
# Cumulative percentage of explained variance.
print('Porcentaje acumulado de varianza:')
explained_variance_ratio_cumsum=explained_variance_ratio_.cumsum()
print(explained_variance_ratio_cumsum.round(3))
print("El Numero de componentes seleccionados es: ", num_componentsSeleccionados)
num_pc= num_componentsSeleccionados
a = range(1,num_pc+1)
num_pc= a[::1]  # no-op slice; num_pc becomes range 1..num_componentsSeleccionados
# Per-component and cumulative variance plots.
plt.figure(figsize=(12,10))
plt.subplot(2, 2, 1)
plt.plot(num_pc, explained_variance_ratio_, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianzas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper right")
plt.grid()
plt.subplot(2, 2, 2)
plt.plot(num_pc, explained_variance_ratio_cumsum, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianzas Acumulativas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper left")
plt.grid()
plt.show()
principalDfJ = pd.DataFrame(data = X_transformed
                            , columns = num_pc)
principalDfJ.describe()
# Attach the classification target, then split features/labels.
principalDfJ['Salida']=datasetJuventud['covid19_positivo'].values
XSJ = principalDfJ.drop('Salida',1)  # positional axis arg is deprecated in pandas
ySJ = principalDfJ['Salida']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XSJ, ySJ, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # scaler fitted on train only, reused on test
X_test = sc.transform(X_test)
from sklearn.decomposition import PCA
pca = PCA()
PCAX_train = pca.fit_transform(X_train)
PCAX_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(PCAX_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(PCAX_test)
y_pred
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy: ' + str(accuracy_score(y_test, y_pred)))
# Sweep n_components = 1..21 and record the RF accuracy for each.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
PDAaccuracy= []
PDAncomponents=[]
for i in range(1,22):
    pca = PCA(n_components=i);
    PCAX_train = pca.fit_transform(X_train);
    PCAX_test = pca.transform(X_test);
    explained_variance = pca.explained_variance_ratio_;
    classifier = RandomForestClassifier(max_depth=2, random_state=0);
    classifier.fit(PCAX_train, y_train);
    y_pred = classifier.predict(PCAX_test);
    PDAncomponents.append(i);
    cm = confusion_matrix(y_test, y_pred);
    PDAaccuracy.append(accuracy_score(y_test, y_pred));
# Best accuracy and its 1-based component count (index i -> n_components i+1).
AccuracyMaximo = max(PDAaccuracy, key=float);
numComponente = PDAaccuracy.index(AccuracyMaximo);
print("Num. Componentes = ", numComponente+1, "AcuraccyMaximo = ", AccuracyMaximo)
# Elbow method: k-means inertia and wall-clock time for k = 1..21.
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn import metrics
from time import time
X=principalDfJ
K_range=range(1,22,1)
distortions=[]
timeTotal=[]
for i in K_range:
    timeI=time()
    kmeanModel = KMeans(n_clusters=i,init='k-means++')
    kmeanModel.fit(principalDfJ)
    distortions.append(kmeanModel.inertia_)
    timeF=time()
    timeC= timeF - timeI
    timeTotal.append(timeC)
fig1=plt.figure()
ex = fig1.add_subplot(111)
ex.plot(K_range, distortions, 'b*-')
plt.grid(True)
plt.xlabel('Numeros de clusters')
plt.ylabel('Distorcion de la Media')
plt.title('Eleccion del Mejor K con el Metodo de Elbow')
fig2=plt.figure()
plt.plot(K_range, timeTotal, 'b*-', color="red")
plt.grid(True)
plt.xlabel('Numero de Clusters')
plt.ylabel('Tiempo de Ejecucion (segundos)')
plt.title('Tiempo de Clusters')
def doKmeans(x, nclust=12, init='k-means++', max_iter=100, tol=0.0001, random_state=10, algorithm='full'):
    """Run K-Means on *x* and return ``(labels, centroids)``.

    Bug fix: the original built ``KMeans(nclust)`` and silently ignored
    ``init``, ``max_iter``, ``tol``, ``random_state`` and ``algorithm``, so
    runs were non-reproducible and callers' settings had no effect.  All
    arguments are now forwarded to ``sklearn.cluster.KMeans``.
    NOTE(review): ``algorithm='full'`` was renamed ``'lloyd'`` in recent
    scikit-learn releases — confirm the installed version accepts 'full'.
    """
    model = KMeans(n_clusters=nclust, init=init, max_iter=max_iter,
                   tol=tol, random_state=random_state, algorithm=algorithm)
    clust_labels = model.fit_predict(x)  # fit + predict on the same data
    cent = model.cluster_centers_
    return (clust_labels, cent)
# Cluster the youth PCA projection.  NOTE(review): k=40 requested although the
# elbow curve above only examined k in 1..21 — confirm intended.
clust_labels, cent = doKmeans(principalDfJ, 40, init='k-means++',max_iter=100, tol=0.0001, random_state=10, algorithm='full')
kmeans = pd.DataFrame(clust_labels,columns=['Grupos'])
kmeans
# Users per cluster, largest first, then plot the distribution.
UserGrupoK=kmeans.groupby(kmeans.Grupos).Grupos.count()
UserGrupoK=UserGrupoK.sort_values(ascending=False, inplace=False, kind='quicksort')
import matplotlib.pyplot as plt
grupo=UserGrupoK.index.tolist()
valores=UserGrupoK.values.tolist()
plt.rcParams.update({'font.size': 5})
plt.plot(range(len(grupo)), valores, color='black', marker='o')
plt.title("Distribucion de los usuarios")
plt.xlabel("Cluster")
plt.ylabel("Numero Usuarios")
plt.show()
# Re-project to 3 principal components for the 3-D view (X = principalDfJ here).
from mpl_toolkits.mplot3d import axes3d
from sklearn import datasets
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
num_components=3
pca = PCA(num_components)
principalComponents = pca.fit_transform(X)
num_components=principalComponents.shape[1]
explained_variance_ratio_=pca.explained_variance_ratio_
explained_variance_ratio_
a = range(num_components)
num_pc= a[::1]  # no-op slice
principalDfJ = pd.DataFrame(data = principalComponents, columns = num_pc)
principalDfJ=round(principalDfJ, 2)
# Cluster the 3-component projection (k=12) and attach labels as 'target'.
clust_labels, cent = doKmeans(principalDfJ, 12)
kmeans = pd.DataFrame(clust_labels)
finalDfJ = pd.concat([principalDfJ, kmeans], axis = 1)
finalDfJ.columns = ['ComponentePrincipal1', 'ComponentePrincipal2', 'ComponentePrincipal3', 'target']
# Notebook cell: interactive backend + 3-D animation scaffolding.
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
def init():
    """Draw the initial 3-D PCA scatter frame for the 12 youth clusters.

    FuncAnimation init_func; reads module globals ``ax1``, ``finalDfJ``
    (3 principal components + k-means 'target' label) and ``fig``.

    Bug fix: the color list contained the typo "prink", which is not a valid
    matplotlib color, so the scatter call for target 7 raised ValueError.
    Replaced with "pink".
    """
    ax1.set_xlabel('Componente Principal 1', fontsize=10)
    ax1.set_ylabel('Componente Principal 2', fontsize=10)
    ax1.set_zlabel('Componente Principal 3', fontsize=10)
    ax1.set_title('PCA 3D',fontsize=15)
    # One scatter call per cluster; legend order follows call order.
    targets=[0,1,2,3,4,5,6,7,8,9,10,11]
    targetsNom=['Grupo 1', 'Grupo 2','Grupo 3', 'Grupo 4','Grupo 5', 'Grupo 6','Grupo 7', 'Grupo 8', 'Grupo 9','Grupo 10', 'Grupo 11','Grupo 12']
    colors=['blue','green','red', 'yellow',"cyan","brown","magenta","pink",'black', "orange","gray","fuchsia"]
    for target, color in zip(targets,colors):
        indicesToKeep = finalDfJ['target'] == target
        ax1.scatter(finalDfJ.loc[indicesToKeep, 'ComponentePrincipal1']
                    , finalDfJ.loc[indicesToKeep, 'ComponentePrincipal2']
                    , finalDfJ.loc[indicesToKeep, 'ComponentePrincipal3']
                    , c=color
                    )
    ax1.legend(targetsNom)
    ax1.grid()
    return fig,
def animate(i):
    """Rotate the 3-D view by 3.6 degrees per frame (full turn over 100 frames)."""
    ax1.view_init(elev=30., azim=3.6*i)
    return fig,
# Animate the rotating 3-D PCA scatter (youth clusters).
ani = animation.FuncAnimation(fig, animate, init_func=init,
                              frames=100, interval=100, blit=True)
# Re-run k-means (k=12) and show a flat 2-D view of the first two components.
clust_labels, cent = doKmeans(principalDfJ, 12)
kmeans = pd.DataFrame(clust_labels)
kmeans
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(principalDfJ[0],principalDfJ[1],c=kmeans[0],s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('1 FACTOR')
ax.set_ylabel('2 FACTOR')
plt.colorbar(scatter)
# ============ Adulthood subset: PCA variance analysis + Random Forest ============
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# NOTE(review): .drop() result is discarded (not in-place), so the target
# column 'covid19_positivo' remains inside XAZ below — likely target leakage.
datasetAdultez.drop('covid19_positivo',1)
XAZ=datasetAdultez
n_samples = XAZ.shape[0]
n_features = XAZ.shape[1]
# First approach: percentage of variance captured by each component.
print('********** Primera Forma')
num_components = min(n_samples, n_features)
#num_components=num_componentsSeleccionados
# Keep 85% of the data variance (original comment said 90%, but n_components=.85).
pca = PCA(n_components=.85)
X_transformed = pca.fit_transform(XAZ)  # matrix of principal components
eigenvalues = pca.explained_variance_
num_componentsSeleccionados = X_transformed.shape[1]
print(num_componentsSeleccionados)
print('Varianzas:')
print(eigenvalues.round(3))
explained_variance_ratio_=pca.explained_variance_ratio_
print('Porcentaje de varianza de cada dimension con respecto a la varianza total:')
print(explained_variance_ratio_.round(3))
# Cumulative percentage of explained variance.
print('Porcentaje acumulado de varianza:')
explained_variance_ratio_cumsum=explained_variance_ratio_.cumsum()
print(explained_variance_ratio_cumsum.round(3))
print("El Numero de componentes seleccionados es: ", num_componentsSeleccionados)
num_pc= num_componentsSeleccionados
a = range(1,num_pc+1)
num_pc= a[::1]  # no-op slice; num_pc becomes range 1..num_componentsSeleccionados
# Per-component and cumulative variance plots.
plt.figure(figsize=(12,10))
plt.subplot(2, 2, 1)
plt.plot(num_pc, explained_variance_ratio_, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianzas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper right")
plt.grid()
plt.subplot(2, 2, 2)
plt.plot(num_pc, explained_variance_ratio_cumsum, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianzas Acumulativas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper left")
plt.grid()
plt.show()
principalDfAZ = pd.DataFrame(data = X_transformed
                             , columns = num_pc)
principalDfAZ.describe()
# Attach the classification target, then split features/labels.
# NOTE(review): XAZ is rebound here from the raw dataset to the PCA features.
principalDfAZ['Salida']=datasetAdultez['covid19_positivo'].values
XAZ = principalDfAZ.drop('Salida',1)  # positional axis arg is deprecated in pandas
yAZ = principalDfAZ['Salida']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XAZ, yAZ, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # scaler fitted on train only, reused on test
X_test = sc.transform(X_test)
from sklearn.decomposition import PCA
pca = PCA()
PCAX_train = pca.fit_transform(X_train)
PCAX_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(PCAX_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(PCAX_test)
y_pred
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy: ' + str(accuracy_score(y_test, y_pred)))
# Sweep n_components = 1..21 and record the RF accuracy for each.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
PDAaccuracy= []
PDAncomponents=[]
for i in range(1,22):
    pca = PCA(n_components=i);
    PCAX_train = pca.fit_transform(X_train);
    PCAX_test = pca.transform(X_test);
    explained_variance = pca.explained_variance_ratio_;
    classifier = RandomForestClassifier(max_depth=2, random_state=0);
    classifier.fit(PCAX_train, y_train);
    y_pred = classifier.predict(PCAX_test);
    PDAncomponents.append(i);
    cm = confusion_matrix(y_test, y_pred);
    PDAaccuracy.append(accuracy_score(y_test, y_pred));
# Best accuracy and its 1-based component count (index i -> n_components i+1).
AccuracyMaximo = max(PDAaccuracy, key=float);
numComponente = PDAaccuracy.index(AccuracyMaximo);
print("Num. Componentes = ", numComponente+1, "AcuraccyMaximo = ", AccuracyMaximo)
# Elbow method: k-means inertia and wall-clock time for k = 1..21.
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn import metrics
from time import time
X=principalDfAZ
K_range=range(1,22,1)
distortions=[]
timeTotal=[]
for i in K_range:
    timeI=time()
    kmeanModel = KMeans(n_clusters=i,init='k-means++')
    kmeanModel.fit(principalDfAZ)
    distortions.append(kmeanModel.inertia_)
    timeF=time()
    timeC= timeF - timeI
    timeTotal.append(timeC)
fig1=plt.figure()
ex = fig1.add_subplot(111)
ex.plot(K_range, distortions, 'b*-')
plt.grid(True)
plt.xlabel('Numeros de clusters')
plt.ylabel('Distorcion de la Media')
plt.title('Eleccion del Mejor K con el Metodo de Elbow')
fig2=plt.figure()
plt.plot(K_range, timeTotal, 'b*-', color="red")
plt.grid(True)
plt.xlabel('Numero de Clusters')
plt.ylabel('Tiempo de Ejecucion (segundos)')
plt.title('Tiempo de Clusters')
def doKmeans(x, nclust=2, init='k-means++', max_iter=100, tol=0.0001, random_state=10, algorithm='full'):
    """Run K-Means on *x* and return ``(labels, centroids)``.

    Bug fix: the original built ``KMeans(nclust)`` and silently ignored
    ``init``, ``max_iter``, ``tol``, ``random_state`` and ``algorithm``, so
    runs were non-reproducible and callers' settings had no effect.  All
    arguments are now forwarded to ``sklearn.cluster.KMeans``.
    NOTE(review): ``algorithm='full'`` was renamed ``'lloyd'`` in recent
    scikit-learn releases — confirm the installed version accepts 'full'.
    """
    model = KMeans(n_clusters=nclust, init=init, max_iter=max_iter,
                   tol=tol, random_state=random_state, algorithm=algorithm)
    clust_labels = model.fit_predict(x)  # fit + predict on the same data
    cent = model.cluster_centers_
    return (clust_labels, cent)
# Cluster the adulthood PCA projection.  NOTE(review): k=40 requested although
# the elbow curve above only examined k in 1..21 — confirm intended.
clust_labels, cent = doKmeans(principalDfAZ, 40, init='k-means++',max_iter=100, tol=0.0001, random_state=10, algorithm='full')
kmeans = pd.DataFrame(clust_labels,columns=['Grupos'])
kmeans
# Users per cluster, largest first, then plot the distribution.
UserGrupoK=kmeans.groupby(kmeans.Grupos).Grupos.count()
UserGrupoK=UserGrupoK.sort_values(ascending=False, inplace=False, kind='quicksort')
import matplotlib.pyplot as plt
grupo=UserGrupoK.index.tolist()
valores=UserGrupoK.values.tolist()
plt.rcParams.update({'font.size': 5})
plt.plot(range(len(grupo)), valores, color='black', marker='o')
plt.title("Distribucion de los usuarios")
plt.xlabel("Cluster")
plt.ylabel("Numero Usuarios")
plt.show()
# Re-project to 3 principal components for the 3-D view (X = principalDfAZ here).
from mpl_toolkits.mplot3d import axes3d
from sklearn import datasets
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
num_components=3
pca = PCA(num_components)
principalComponents = pca.fit_transform(X)
num_components=principalComponents.shape[1]
explained_variance_ratio_=pca.explained_variance_ratio_
explained_variance_ratio_
a = range(num_components)
num_pc= a[::1]  # no-op slice
principalDfAZ = pd.DataFrame(data = principalComponents, columns = num_pc)
principalDfAZ=round(principalDfAZ, 2)
# Cluster the 3-component projection.  NOTE(review): k=10 here, but the
# 3-D plot that follows only colours targets 0 and 1 — confirm intended k.
clust_labels, cent = doKmeans(principalDfAZ, 10)
kmeans = pd.DataFrame(clust_labels)
finalDfAZ = pd.concat([principalDfAZ, kmeans], axis = 1)
finalDfAZ.columns = ['ComponentePrincipal1', 'ComponentePrincipal2', 'ComponentePrincipal3', 'target']
# Notebook cell: interactive backend + 3-D animation scaffolding.
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')
def init():
    """Draw the initial 3-D PCA scatter frame for the adulthood clusters.

    FuncAnimation init_func; reads module globals ``ax1``, ``finalDfAZ``
    (3 principal components + k-means 'target' label) and ``fig``.
    NOTE(review): only targets 0 and 1 are plotted, although the preceding
    clustering used k=10 — rows in clusters 2..9 are silently omitted.
    """
    ax1.set_xlabel('Componente Principal 1', fontsize=10)
    ax1.set_ylabel('Componente Principal 2', fontsize=10)
    ax1.set_zlabel('Componente Principal 3', fontsize=10)
    ax1.set_title('PCA 3D',fontsize=15)
    # One scatter call per cluster; legend order follows call order.
    targets=[0,1]
    targetsNom=['Grupo 1', 'Grupo 2']
    colors=['blue','green']
    for target, color in zip(targets,colors):
        indicesToKeep = finalDfAZ['target'] == target
        ax1.scatter(finalDfAZ.loc[indicesToKeep, 'ComponentePrincipal1']
                    , finalDfAZ.loc[indicesToKeep, 'ComponentePrincipal2']
                    , finalDfAZ.loc[indicesToKeep, 'ComponentePrincipal3']
                    , c=color
                    )
    ax1.legend(targetsNom)
    ax1.grid()
    return fig,
def animate(i):
    """Rotate the 3-D view by 3.6 degrees per frame (full turn over 100 frames)."""
    ax1.view_init(elev=30., azim=3.6*i)
    return fig,
# Animate the rotating 3-D PCA scatter (adulthood clusters).
ani = animation.FuncAnimation(fig, animate, init_func=init,
                              frames=100, interval=100, blit=True)
# Re-run k-means (k=2) and show a flat 2-D view of the first two components.
clust_labels, cent = doKmeans(principalDfAZ, 2)
kmeans = pd.DataFrame(clust_labels)
kmeans
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(principalDfAZ[0],principalDfAZ[1],c=kmeans[0],s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('1 FACTOR')
ax.set_ylabel('2 FACTOR')
plt.colorbar(scatter)
# ============ Elderly subset: PCA variance analysis + Random Forest ============
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# NOTE(review): .drop() result is discarded (not in-place), so the target
# column 'covid19_positivo' remains inside XM below — likely target leakage.
datasetPMayor.drop('covid19_positivo',1)
XM=datasetPMayor
n_samples = XM.shape[0]
n_features = XM.shape[1]
# First approach: percentage of variance captured by each component.
print('********** Primera Forma')
num_components = min(n_samples, n_features)
#num_components=num_componentsSeleccionados
# Keep 85% of the variance of the data.
pca = PCA(n_components=.85)
X_transformed = pca.fit_transform(XM)  # matrix of principal components
eigenvalues = pca.explained_variance_
num_componentsSeleccionados = X_transformed.shape[1]
print(num_componentsSeleccionados)
print('Varianzas:')
print(eigenvalues.round(3))
explained_variance_ratio_=pca.explained_variance_ratio_
print('Porcentaje de varianza de cada dimension con respecto a la varianza total:')
print(explained_variance_ratio_.round(3))
# Cumulative percentage of explained variance.
print('Porcentaje acumulado de varianza:')
explained_variance_ratio_cumsum=explained_variance_ratio_.cumsum()
print(explained_variance_ratio_cumsum.round(3))
print("El Numero de componentes seleccionados es: ", num_componentsSeleccionados)
num_pc= num_componentsSeleccionados
a = range(1,num_pc+1)
num_pc= a[::1]  # no-op slice; num_pc becomes range 1..num_componentsSeleccionados
# Per-component and cumulative variance plots.
plt.figure(figsize=(12,10))
plt.subplot(2, 2, 1)
plt.plot(num_pc, explained_variance_ratio_, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=5, label = 'Valor Varianzas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper right")
plt.grid()
plt.subplot(2, 2, 2)
plt.plot(num_pc, explained_variance_ratio_cumsum, color='orange', linestyle='dashed', linewidth = 3,
         marker='o', markerfacecolor='blue', markersize=8, label = 'Valor Varianzas Acumulativas')
plt.title('Varianza con '+str(num_componentsSeleccionados)+' Componenentes Principales')
plt.xlabel('N Componentes Principales')
plt.ylabel('Varianza')
plt.legend(loc="upper left")
plt.grid()
plt.show()
principalDfM = pd.DataFrame(data = X_transformed
                            , columns = num_pc)
principalDfM.describe()
# Attach the classification target, then split features/labels.
# NOTE(review): XM is rebound here from the raw dataset to the PCA features.
principalDfM['Salida']=datasetPMayor['covid19_positivo'].values
XM = principalDfM.drop('Salida',1)  # positional axis arg is deprecated in pandas
yM = principalDfM['Salida']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XM, yM, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # scaler fitted on train only, reused on test
X_test = sc.transform(X_test)
from sklearn.decomposition import PCA
pca = PCA()
PCAX_train = pca.fit_transform(X_train)
PCAX_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(PCAX_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(PCAX_test)
y_pred
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy: ' + str(accuracy_score(y_test, y_pred)))
# Sweep n_components = 1..18 (smaller feature set for this subset) and record accuracies.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
PDAaccuracy= []
PDAncomponents=[]
for i in range(1,19):
    pca = PCA(n_components=i);
    PCAX_train = pca.fit_transform(X_train);
    PCAX_test = pca.transform(X_test);
    explained_variance = pca.explained_variance_ratio_;
    classifier = RandomForestClassifier(max_depth=2, random_state=0);
    classifier.fit(PCAX_train, y_train);
    y_pred = classifier.predict(PCAX_test);
    PDAncomponents.append(i);
    cm = confusion_matrix(y_test, y_pred);
    PDAaccuracy.append(accuracy_score(y_test, y_pred));
# Best accuracy and its 1-based component count (index i -> n_components i+1).
AccuracyMaximo = max(PDAaccuracy, key=float);
numComponente = PDAaccuracy.index(AccuracyMaximo);
print("Num. Componentes = ", numComponente+1, "AcuraccyMaximo = ", AccuracyMaximo)
# Elbow method: k-means inertia and wall-clock time for k = 1..18.
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn import metrics
from time import time
X=principalDfM
K_range=range(1,19,1)
distortions=[]
timeTotal=[]
for i in K_range:
    timeI=time()
    kmeanModel = KMeans(n_clusters=i,init='k-means++')
    kmeanModel.fit(principalDfM)
    distortions.append(kmeanModel.inertia_)
    timeF=time()
    timeC= timeF - timeI
    timeTotal.append(timeC)
fig1=plt.figure()
ex = fig1.add_subplot(111)
ex.plot(K_range, distortions, 'b*-')
plt.grid(True)
plt.xlabel('Numeros de clusters')
plt.ylabel('Distorcion de la Media')
plt.title('Eleccion del Mejor K con el Metodo de Elbow')
fig2=plt.figure()
plt.plot(K_range, timeTotal, 'b*-', color="red")
plt.grid(True)
plt.xlabel('Numero de Clusters')
plt.ylabel('Tiempo de Ejecucion (segundos)')
plt.title('Tiempo de Clusters')
def doKmeans(x, nclust=5, init='k-means++', max_iter=100, tol=0.0001, random_state=10, algorithm='full'):
    """Run K-Means on *x* and return ``(labels, centroids)``.

    Bug fix: the original built ``KMeans(nclust)`` and silently ignored
    ``init``, ``max_iter``, ``tol``, ``random_state`` and ``algorithm``, so
    runs were non-reproducible and callers' settings had no effect.  All
    arguments are now forwarded to ``sklearn.cluster.KMeans``.
    NOTE(review): ``algorithm='full'`` was renamed ``'lloyd'`` in recent
    scikit-learn releases — confirm the installed version accepts 'full'.
    """
    model = KMeans(n_clusters=nclust, init=init, max_iter=max_iter,
                   tol=tol, random_state=random_state, algorithm=algorithm)
    clust_labels = model.fit_predict(x)  # fit + predict on the same data
    cent = model.cluster_centers_
    return (clust_labels, cent)
clust_labels, cent = doKmeans(principalDfM, 40, init='k-means++',max_iter=100, tol=0.0001, random_state=10, algorithm='full')
kmeans = pd.DataFrame(clust_labels,columns=['Grupos'])
kmeans
UserGrupoK=kmeans.groupby(kmeans.Grupos).Grupos.count()
UserGrupoK=UserGrupoK.sort_values(ascending=False, inplace=False, kind='quicksort')
import matplotlib.pyplot as plt
grupo=UserGrupoK.index.tolist()
valores=UserGrupoK.values.tolist()
plt.rcParams.update({'font.size': 5})
plt.plot(range(len(grupo)), valores, color='black', marker='o')
plt.title("Distribucion de los usuarios")
plt.xlabel("Cluster")
plt.ylabel("Numero Usuarios")
plt.show()
from mpl_toolkits.mplot3d import axes3d
from sklearn import datasets
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
# Project the data onto the first 3 principal components, round for display,
# cluster into 5 groups, and assemble `finalDfM` (components + cluster label).
num_components = 3
pca = PCA(num_components)
principalComponents = pca.fit_transform(X)
num_components = principalComponents.shape[1]
explained_variance_ratio_ = pca.explained_variance_ratio_
explained_variance_ratio_
# Integer column labels 0..num_components-1 (same effect as the old range slice).
num_pc = list(range(num_components))
principalDfM = pd.DataFrame(data=principalComponents, columns=num_pc).round(2)
clust_labels, cent = doKmeans(principalDfM, 5)
kmeans = pd.DataFrame(clust_labels)
finalDfM = pd.concat([principalDfM, kmeans], axis=1)
finalDfM.columns = ['ComponentePrincipal1', 'ComponentePrincipal2', 'ComponentePrincipal3', 'target']
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
# Animated 3D scatter of the three principal components, coloured by cluster.
fig = plt.figure()
ax1 = fig.add_subplot(111, projection='3d')

def init():
    """Draw the static 3D scatter once; used as FuncAnimation's init_func."""
    ax1.set_xlabel('Componente Principal 1', fontsize=10)
    ax1.set_ylabel('Componente Principal 2', fontsize=10)
    ax1.set_zlabel('Componente Principal 3', fontsize=10)
    ax1.set_title('PCA 3D', fontsize=15)
    targets = [0, 1, 2, 3, 4]
    # zip() truncates at the 5 targets, so the 6th colour is never used.
    colors = ['green', 'red', 'yellow', 'blue', 'cyan', 'magenta']
    targetsNom = ['Grupo 1', 'Grupo 2', 'Grupo 3', 'Grupo 4', 'Grupo 5']
    for target, color in zip(targets, colors):
        mask = finalDfM['target'] == target
        ax1.scatter(finalDfM.loc[mask, 'ComponentePrincipal1'],
                    finalDfM.loc[mask, 'ComponentePrincipal2'],
                    finalDfM.loc[mask, 'ComponentePrincipal3'],
                    c=color)
    ax1.legend(targetsNom)
    ax1.grid()
    return fig,

def animate(i):
    """Rotate the camera: one full turn over 100 frames (3.6 deg per frame)."""
    ax1.view_init(elev=30., azim=3.6 * i)
    return fig,

ani = animation.FuncAnimation(fig, animate, init_func=init,
                              frames=100, interval=100, blit=True)
# Re-cluster into 5 groups and show a 2D scatter of the first two principal
# components, colouring each point by its cluster assignment.
clust_labels, cent = doKmeans(principalDfM, 5)
kmeans = pd.DataFrame(clust_labels)
kmeans
fig = plt.figure()
axes = fig.add_subplot(111)
points = axes.scatter(principalDfM[0], principalDfM[1], c=kmeans[0], s=50)
axes.set_title('K-Means Clustering')
axes.set_xlabel('1 FACTOR')
axes.set_ylabel('2 FACTOR')
plt.colorbar(points)